In some applications it might be desired to pick a subset of the original features rather then find a mapping that uses all of the original features. The benefits of finding this subset of features could be in saving cost of computing unnecessary features, saving cost of sensors (in physical measurement systems), and in excluding noisy features while keeping their information using “clean” features (for example tracking points on a face using easy to track points- and inferring the other points based on those few measurements).
How does the algorithm look like:
class PFA(object):
def __init__(self, n_features, q=None):
self.q = q
self.n_features = n_features
def fit(self, X):
if not self.q:
self.q = X.shape[1]
sc = StandardScaler()
X = sc.fit_transform(X)
pca = PCA(n_components=self.q).fit(X)
A_q = pca.components_.T
kmeans = KMeans(n_clusters=self.n_features).fit(A_q)
clusters = kmeans.predict(A_q)
cluster_centers = kmeans.cluster_centers_
dists = defaultdict(list)
for i, c in enumerate(clusters):
dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
dists[c].append((i, dist))
self.indices_ = [sorted(f, key=lambda x: x[1])[0][0] for f in dists.values()]
self.features_ = X[:, self.indices_]
fig <- plot_ly(x = feature_list$X1,
y = feature_list$X0,
type = 'bar',
orientation = 'h'
) %>%
layout(yaxis = list(categoryorder = "total ascending"))
fig
## [1] "dissChannels.platf." "dissChannels.prof."
## [3] "dissChannels.mono." "concepts.review."
## [5] "adoptByPolicyHow.SQ002." "dissChannels.peer."
## [7] "dissChannels.policy." "concepts.pub."
## [9] "natureOfInvolvement.busi." "dissChannels.web."
## [11] "concepts3" "dissChannels.conf."
## [13] "adoptByPolicy.rate." "adoptByPolicyHow.SQ003."
## [15] "dissChannels.trad." "dissChannels.socmed."
## [17] "dissChannels.consult." "dissChannels.events."
## [19] "dissChannels.public." "concepts.data."
## [21] "concepts.code." "concepts.infra."
## [23] "contribToSI.rate." "groupsInvolved.res."
## [25] "natureOfInvolvement.res." "contribToSI.rate."
## Loading required package: lattice
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
# --- FA explanatory
# Maximum Likelihood Factor Analysis
# entering raw data and extracting 3 factors,
# with varimax rotation
fit <- factanal(df_red, 8, rotation="varimax")
print(fit, digits=2, cutoff=.3, sort=TRUE)
##
## Call:
## factanal(x = df_red, factors = 8, rotation = "varimax")
##
## Uniquenesses:
## transdisciplinaryExp.rate. familiarWithSI.response.
## 0.76 0.66
## motivation.pheno. motivation.prob.
## 0.92 0.91
## motivation.welfare. benefitForNonAcademy
## 0.41 0.30
## impulseForNonAcad.soc. impulseForNonAcad.econ.
## 0.54 0.76
## impulseForNonAcad.ecol. impulseForNonAcad.health.
## 0.84 0.69
## impulseForNonAcad.tech. groupsInvolved.busi.
## 0.92 0.69
## groupsInvolved.civsoc. groupsInvolved.policy.
## 0.19 0.33
## groupsInvolved.citiz. groupsInvolved.media.
## 0.52 0.82
## groupsInvolved.welfare. natureOfInvolvement.civsoc.
## 0.35 0.37
## natureOfInvolvement.policy. natureOfInvolvement.citiz.
## 0.61 0.63
## natureOfInvolvement.media. natureOfInvolvement.welfare.
## 0.92 0.44
## targetGroupsGoals.socneeds. targetGroupsGoals.socgroups.
## 0.52 0.55
## targetGroupsGoals.improve. targetGroupsGoals.empower.
## 0.38 0.49
## targetGroupsGoals.diversity. concepts2
## 0.63 0.65
## impactTargetGroup.pub. impactTargetGroup.busi.
## 0.38 0.18
## impactTargetGroup.socgr. impactTargetGroup.welfare.
## 0.43 0.38
## impactTargetGroup.civsoc. impactTargetGroup.policy.
## 0.45 0.24
## impactTargetGroup.acad. kindOfChange.pub.
## 0.80 0.48
## kindOfChange.busi. kindOfChange.socgr.
## 0.33 0.52
## kindOfChange.welfare. kindOfChange.civsoc.
## 0.36 0.61
## kindOfChange.policy. kindOfChange.acad.
## 0.33 0.71
## adoptByPolicyHow.SQ001. Impactstatements.capab.
## 0.79 0.43
## Impactstatements.emanc. Impactstatements.understanding.
## 0.42 0.38
## Impactstatements.mitig. Impactstatements.unknown.
## 0.44 0.60
## Impactstatements.unaddressed. scalabilityRating.up.
## 0.77 0.42
## scalabilityRating.out. scalabilityRating.deep.
## 0.41 0.33
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## impulseForNonAcad.soc. 0.57
## groupsInvolved.citiz. 0.51 0.34
## targetGroupsGoals.socneeds. 0.53 0.34
## targetGroupsGoals.socgroups. 0.57
## impactTargetGroup.socgr. 0.57
## Impactstatements.capab. 0.52 0.33
## Impactstatements.emanc. 0.64
## Impactstatements.understanding. 0.71
## Impactstatements.mitig. 0.59
## scalabilityRating.deep. 0.51
## motivation.welfare. 0.67
## benefitForNonAcademy 0.71
## targetGroupsGoals.improve. 0.72
## groupsInvolved.policy. 0.70
## impactTargetGroup.policy. 0.66
## kindOfChange.policy. 0.56
## groupsInvolved.welfare. 0.30 0.69
## natureOfInvolvement.welfare. 0.33 0.63
## impactTargetGroup.welfare. 0.60
## kindOfChange.welfare. 0.66
## groupsInvolved.civsoc. 0.36 0.77
## natureOfInvolvement.civsoc. 0.76
## impactTargetGroup.civsoc. 0.34 0.53
## impactTargetGroup.busi. 0.84
## kindOfChange.busi. 0.76
## scalabilityRating.up. 0.35
## scalabilityRating.out. 0.30
## kindOfChange.pub.
## transdisciplinaryExp.rate. 0.33
## familiarWithSI.response. 0.43
## motivation.pheno.
## motivation.prob.
## impulseForNonAcad.econ. 0.33
## impulseForNonAcad.ecol.
## impulseForNonAcad.health. 0.47
## impulseForNonAcad.tech.
## groupsInvolved.busi. 0.50
## groupsInvolved.media.
## natureOfInvolvement.policy. 0.49
## natureOfInvolvement.citiz. 0.43 0.36
## natureOfInvolvement.media.
## targetGroupsGoals.empower. 0.45 0.30
## targetGroupsGoals.diversity. 0.43
## concepts2 0.49
## impactTargetGroup.pub. 0.44 0.35
## impactTargetGroup.acad.
## kindOfChange.socgr. 0.48
## kindOfChange.civsoc. 0.45
## kindOfChange.acad.
## adoptByPolicyHow.SQ001.
## Impactstatements.unknown. 0.50
## Impactstatements.unaddressed. 0.33
## Factor7 Factor8
## impulseForNonAcad.soc.
## groupsInvolved.citiz.
## targetGroupsGoals.socneeds.
## targetGroupsGoals.socgroups.
## impactTargetGroup.socgr.
## Impactstatements.capab.
## Impactstatements.emanc.
## Impactstatements.understanding.
## Impactstatements.mitig.
## scalabilityRating.deep. 0.50
## motivation.welfare.
## benefitForNonAcademy
## targetGroupsGoals.improve.
## groupsInvolved.policy.
## impactTargetGroup.policy. 0.36
## kindOfChange.policy. 0.47
## groupsInvolved.welfare.
## natureOfInvolvement.welfare.
## impactTargetGroup.welfare. 0.34
## kindOfChange.welfare. 0.32
## groupsInvolved.civsoc.
## natureOfInvolvement.civsoc.
## impactTargetGroup.civsoc.
## impactTargetGroup.busi.
## kindOfChange.busi.
## scalabilityRating.up. 0.54
## scalabilityRating.out. 0.53
## kindOfChange.pub. 0.62
## transdisciplinaryExp.rate.
## familiarWithSI.response.
## motivation.pheno.
## motivation.prob.
## impulseForNonAcad.econ.
## impulseForNonAcad.ecol.
## impulseForNonAcad.health.
## impulseForNonAcad.tech.
## groupsInvolved.busi.
## groupsInvolved.media.
## natureOfInvolvement.policy.
## natureOfInvolvement.citiz.
## natureOfInvolvement.media.
## targetGroupsGoals.empower.
## targetGroupsGoals.diversity.
## concepts2
## impactTargetGroup.pub. 0.41
## impactTargetGroup.acad. 0.35
## kindOfChange.socgr. 0.42
## kindOfChange.civsoc.
## kindOfChange.acad. 0.43
## adoptByPolicyHow.SQ001.
## Impactstatements.unknown.
## Impactstatements.unaddressed.
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7 Factor8
## SS loadings 6.56 2.96 2.75 2.72 2.64 2.41 2.23 1.74
## Proportion Var 0.13 0.06 0.05 0.05 0.05 0.05 0.04 0.03
## Cumulative Var 0.13 0.18 0.24 0.29 0.34 0.39 0.43 0.46
##
## Test of the hypothesis that 8 factors are sufficient.
## The chi square statistic is 2419.95 on 938 degrees of freedom.
## The p-value is 2.03e-131
# plot factor 1 by factor 2
#load <- fit$loadings[,1:2]
#plot(load,type="n") # set up plot
#text(load,labels=names(df_red),cex=.7) # add variable names
#fit$loadings
model_theory <-"
## SI Familiarity
fam =~ familiarWithSI.response.+transdisciplinaryExp.rate.
## intention_agency
ia_human_condition =~ motivation.welfare.+benefitForNonAcademy+impulseForNonAcad.soc.+targetGroupsGoals.improve.+impulseForNonAcad.health.+impulseForNonAcad.ecol.
ia_non_academic =~ impulseForNonAcad.econ.+impulseForNonAcad.tech.
## transdisciplinary_apects
transdisciplinary_social =~ groupsInvolved.citiz.+groupsInvolved.civsoc.+groupsInvolved.welfare.+natureOfInvolvement.citiz.+natureOfInvolvement.civsoc.+natureOfInvolvement.welfare.+targetGroupsGoals.socneeds.+targetGroupsGoals.socgroups.+targetGroupsGoals.empower.+targetGroupsGoals.diversity.
## outcome
outcome_public =~ impactTargetGroup.pub.+impactTargetGroup.socgr.+impactTargetGroup.welfare.+impactTargetGroup.civsoc.+kindOfChange.pub.+kindOfChange.socgr.+kindOfChange.welfare.+kindOfChange.civsoc.
outcome_statement =~ Impactstatements.capab.+Impactstatements.emanc.+Impactstatements.understanding.+Impactstatements.mitig.+Impactstatements.unknown.+Impactstatements.unaddressed.
## MISC:scalability
scale =~ scalabilityRating.up.+scalabilityRating.out.+scalabilityRating.deep.
## MISC:policy
policy =~ groupsInvolved.policy.+impactTargetGroup.policy.+kindOfChange.policy.+natureOfInvolvement.policy.+groupsInvolved.policy.+adoptByPolicyHow.SQ001.
## MISC:business
busi =~ groupsInvolved.busi.+impactTargetGroup.busi.+kindOfChange.busi.
"
model_efa <- "
f1 =~ impulseForNonAcad.soc.+groupsInvolved.citiz.+targetGroupsGoals.socneeds.+targetGroupsGoals.socgroups.+impactTargetGroup.socgr.+Impactstatements.capab.+Impactstatements.emanc.+Impactstatements.understanding.+Impactstatements.mitig.+transdisciplinaryExp.rate.+familiarWithSI.response.
f2 =~ motivation.welfare.+benefitForNonAcademy+targetGroupsGoals.improve.+impulseForNonAcad.health.+impactTargetGroup.pub.
f3 =~ groupsInvolved.policy.+impactTargetGroup.policy.+kindOfChange.policy.+natureOfInvolvement.policy.+impulseForNonAcad.econ.+natureOfInvolvement.policy.
f4 =~ groupsInvolved.welfare.+natureOfInvolvement.welfare.+impactTargetGroup.welfare.+kindOfChange.welfare.
f5 =~ groupsInvolved.civsoc.+natureOfInvolvement.civsoc.+impactTargetGroup.civsoc.+kindOfChange.civsoc.
f6 =~ impactTargetGroup.busi.+kindOfChange.busi.+groupsInvolved.busi.
f7 =~ scalabilityRating.up.+scalabilityRating.out.+scalabilityRating.deep.
f8 =~ kindOfChange.pub.+kindOfChange.socgr.+kindOfChange.acad.
"
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated lv
## variances are negative
modificationindices(fit_theory) %>% arrange(-mi) %>% head(10)
## lhs op rhs mi epc
## 1 outcome_public =~ transdisciplinaryExp.rate. 248.724 4.705
## 2 outcome_statement =~ transdisciplinaryExp.rate. 229.872 3.208
## 3 transdisciplinary_social =~ transdisciplinaryExp.rate. 195.396 15.619
## 4 familiarWithSI.response. ~~ benefitForNonAcademy 174.404 -1.477
## 5 scale =~ transdisciplinaryExp.rate. 146.331 2.080
## 6 groupsInvolved.civsoc. ~~ natureOfInvolvement.civsoc. 144.541 0.098
## 7 groupsInvolved.welfare. ~~ natureOfInvolvement.welfare. 131.410 0.151
## 8 familiarWithSI.response. ~~ impactTargetGroup.busi. 121.142 -6.324
## 9 impactTargetGroup.civsoc. ~~ kindOfChange.civsoc. 116.426 0.556
## 10 ia_human_condition =~ transdisciplinaryExp.rate. 112.005 2.508
## sepc.lv sepc.all sepc.nox
## 1 8.837 2.895 2.895
## 2 7.442 2.438 2.438
## 3 6.745 2.209 2.209
## 4 -1.477 -1.772 -1.772
## 5 6.067 1.987 1.987
## 6 0.098 0.663 0.663
## 7 0.151 0.645 0.645
## 8 -6.324 -2.907 -2.907
## 9 0.556 0.607 0.607
## 10 6.439 2.109 2.109
modificationindices(fit_efa) %>% arrange(-mi) %>% head(10)
## lhs op rhs mi epc
## 1 groupsInvolved.civsoc. ~~ natureOfInvolvement.civsoc. 114.737 0.076
## 2 groupsInvolved.welfare. ~~ natureOfInvolvement.welfare. 85.547 0.126
## 3 groupsInvolved.policy. ~~ natureOfInvolvement.policy. 78.328 0.159
## 4 impactTargetGroup.socgr. ~~ kindOfChange.socgr. 71.901 0.514
## 5 impactTargetGroup.civsoc. ~~ kindOfChange.civsoc. 66.679 0.385
## 6 f1 =~ scalabilityRating.deep. 65.994 7.957
## 7 f6 =~ impactTargetGroup.pub. 64.042 0.470
## 8 f6 =~ Impactstatements.capab. 58.973 0.415
## 9 impactTargetGroup.welfare. ~~ kindOfChange.welfare. 55.633 0.554
## 10 f7 =~ impactTargetGroup.pub. 49.782 0.503
## sepc.lv sepc.all sepc.nox
## 1 0.076 0.739 0.739
## 2 0.126 0.911 0.911
## 3 0.159 0.522 0.522
## 4 0.514 0.521 0.521
## 5 0.385 0.690 0.690
## 6 1.893 0.610 0.610
## 7 1.223 0.393 0.393
## 8 1.078 0.342 0.342
## 9 0.554 0.496 0.496
## 10 1.486 0.477 0.477
anova(fit_theory, fit_efa)
## Warning in lavTestLRT(object = object, ..., model.names = NAMES): lavaan
## WARNING: some models are based on a different set of observed variables
## Chi-Squared Difference Test
##
## Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
## fit_efa 637 37684 38089 2534.8
## fit_theory 909 42713 43203 3688.2 1153.4 272 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
DF.refined <- data.frame(predict(fit_theory))
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated lv
## variances are negative
a <- scales::rescale(apply(DF.refined, mean, MARGIN = 1), to = c(0, 10))
b <- feat_df.num_o$contribToSI.rate.
c <- 1:length(a)
ab <- as.data.frame(cbind(a,b,c))
library(ggplot2)
ggplot(ab, aes(x = c)) +
geom_line(aes(y = a), color = "darkred") +
geom_line(aes(y = b), color="steelblue")
DF.refined2 <- data.frame(predict(fit_efa))
d <- scales::rescale(apply(DF.refined2, mean, MARGIN = 1), to = c(0, 10))
db <- as.data.frame(cbind(d,b,c))
ggplot(db, aes(x = c)) +
geom_line(aes(y = d), color = "darkred") +
geom_line(aes(y = b), color="steelblue")
var_prop_efa <- c(0.13, 0.06, 0.05, 0.05, 0.05, 0.05, 0.04, 0.03)
new_DF_refined2 <- as.data.frame( as.matrix(DF.refined2) %*% diag(var_prop_efa))
f <- scales::rescale(apply(new_DF_refined2, mean, MARGIN = 1), to = c(0, 10))
data <- data.frame(a,b,c,d,f)
fig <- plot_ly(data, x = ~c, y = ~a, name = 'theory', type = 'bar')
fig <- fig %>% add_trace(y = ~b, name = 'contribToSI.rate', type = 'bar')
fig <- fig %>% add_trace(y = ~d, name = 'efa', type = 'bar')
fig <- fig %>% add_trace(y = ~f, name = 'efa_scaled', type = 'bar')
fig
feature_list_red <- feature_list[!(feature_list$X0 %in% features_to_rm),]
feature_list_red$logX <- log(feature_list_red$X1)
df_red2 <- df_red
for (i in seq_along(feature_list_red$logX)) {
df_red2[, i] <- df_red2[, i] * feature_list_red$logX[i]
}
g <- scales::rescale(apply(df_red2, mean, MARGIN = 1), to = c(0, 10))
data$g <- g
library(plotly)
fig1 <- plot_ly(x = data$c, y = data$a, type = 'scatter', mode = 'lines', name="theory")
fig2 <- plot_ly(x = data$c, y = data$b, type = 'scatter', mode = 'lines', name="contribToSI")
fig3 <- plot_ly(x = data$c, y = data$d, type = 'scatter', mode = 'lines', name="efa")
fig4 <- plot_ly(x = data$c, y = data$f, type = 'scatter', mode = 'lines', name="efa_scaled")
fig5 <- plot_ly(x = data$c, y = data$g, type = 'scatter', mode = 'lines', name="log(pfa)")
fig <- subplot(fig1, fig2, fig3, fig4, fig5, nrows = 5) %>%
layout(title = list(text = ""),
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
fig
mean(aggregate( a ~ b , data = data , sum , na.rm = TRUE )$a)
## [1] 85.35234
mean(aggregate( d ~ b , data = data , sum , na.rm = TRUE )$d)
## [1] 85.77354
mean(aggregate( f ~ b , data = data , sum , na.rm = TRUE )$f)
## [1] 87.05012
mean(aggregate( g ~ b , data = data , sum , na.rm = TRUE )$g)
## [1] 112.6411
mean(a-b)
## [1] 0.8943928
mean(d-b)
## [1] 0.9072271
mean(f-b)
## [1] 0.9461254
mean(g-b)
## [1] 1.725907